started: AL26Apr2019
last updated: AL01Sep2019
Inputs:
- WECARE-NFE only outliers: based on 214 variants x 715 samples ( 519BC = 258UBC + 260CBC and 197NFE)
- Joined WECARE-NFE-KGEN eigenvectors: 3,219 samples = 715 wecare-nfe + 2,504 kgen
A number of clear ethinic outliers have not been picked up by WECARE-NFE PCA.
So, some more outliers have been added manually, basing on the joined plot with 1KG,
expanding 26 to 46 outliers
Sys.time()
## [1] "2019-09-01 18:25:57 BST"
rm(list=ls())
graphics.off()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s15_add_PCs_exclude_outliers"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F)
options(warnPartialMatchArgs = T,
warnPartialMatchAttr = T,
warnPartialMatchDollar = T)
#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588
Read data for joined PCA on Ampliseq and 1KG
# Sequencing data (for wecare phenotypes: case/control status)
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s12_check_BRCA1_BRCA2_PALB2"
load(paste(source_folder, "s03_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s15_add_PCs_exclude_outliers"
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s14_explore_wecare_1kg_PCA_plots/s03_all_variants_not_in_LD_971"
# Eigenvectors
eigenvectors_file <- paste(source_folder, "ampliseq_1kg_971_3219_100PCs.eigenvec", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
# 1kg phenotypes (ethnicity and gender)
kg_phenotypes_file <- paste(source_folder, "integrated_call_samples_v3.20130502.ALL.panel", sep="/")
kg_phenotypes.df <- read.table(kg_phenotypes_file, header=T)
# Clean-up
rm(source_folder, eigenvectors_file, kg_phenotypes_file, genotypes.mx, variants.df)
# List of objects
ls()
## [1] "base_folder" "eigenvectors.df" "kg_phenotypes.df" "phenotypes.df"
# Expected number of samples in eigenvectors
2504+715
## [1] 3219
# Dimentions of objects
dim(eigenvectors.df)
## [1] 3219 102
dim(kg_phenotypes.df)
## [1] 2504 4
dim(phenotypes.df)
## [1] 715 24
# Update eigenvectors
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
# Make a table with IDs of overlapping NFE
eigenvectors.df[c(3022,3023),c("IID","PC1")]
## IID PC1
## 9_S346_L008 9_S346_L008 -0.01348900
## 2:HG00097 2:HG00097 -0.00844256
nfe_pca <- eigenvectors.df$IID[3023:3219] # re-processed NFE added to Ampliseq
nfe_ampliseq <- sub("2:","",nfe_pca)
nfe.df <- data.frame(nfe_ampliseq, nfe_pca)
# Remove overlapping NFE from ampliseq-kgen eigenvectors
selected_samples <- ! eigenvectors.df$IID %in% nfe.df$nfe_pca
sum(selected_samples)
## [1] 3022
518+2504
## [1] 3022
eigenvectors_ampliseq_kgen.df <- eigenvectors.df[selected_samples,1:6]
"sample" -> colnames(eigenvectors_ampliseq_kgen.df)[1]
# Prepare ampliseq phenotypes
# (for clarity of PCA plot wecare samples and controls were not separated)
phenotypes.df[c(518:519),c(1,2)]
## long_ids illumina_id
## 9_S346_L008 9_S346_L008 S346
## HG00097 HG00097 <NA>
phenotypes_ampliseq.df <- phenotypes.df[1:518,c("long_ids","cc")]
table(phenotypes_ampliseq.df$cc)
##
## 0 1
## 258 260
"WECARE" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==1] # This could be named as CBC
"WECARE" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==0] # This could be named as UBC
table(phenotypes_ampliseq.df$cc)
##
## WECARE
## 518
c("sample","group") -> colnames(phenotypes_ampliseq.df)
# Prepare kgen phenotypes
phenotypes_kgen.df <- kg_phenotypes.df[,c("sample","super_pop")]
c("sample","group") -> colnames(phenotypes_kgen.df)
# Merge kgen and Ampliseq phenotypes (latest will be on top in the plot)
phenotypes_ampliseq_kgen.df <- rbind(phenotypes_kgen.df,phenotypes_ampliseq.df)
table(phenotypes_ampliseq_kgen.df$group)
##
## AFR AMR EAS EUR SAS WECARE
## 661 347 504 503 489 518
# Add eigenvectors to phenotypes
dim(phenotypes_ampliseq_kgen.df)
## [1] 3022 2
dim(eigenvectors_ampliseq_kgen.df)
## [1] 3022 6
eigenphen_ampliseq_kgen.df <- full_join(
phenotypes_ampliseq_kgen.df, eigenvectors_ampliseq_kgen.df, by="sample")
dim(eigenphen_ampliseq_kgen.df)
## [1] 3022 7
head(eigenphen_ampliseq_kgen.df)
## sample group PC1 PC2 PC3 PC4 PC5
## 1 HG00096 EUR -0.01258860 -0.01642160 0.000925435 -0.001809850 0.00780321
## 2 HG00097 EUR -0.00802238 -0.00726931 0.003157220 -0.002825920 0.00838845
## 3 HG00099 EUR -0.01012120 -0.01259730 0.002215210 -0.006756090 0.03568230
## 4 HG00100 EUR -0.00849754 -0.01684940 0.002751330 -0.001556380 0.01011790
## 5 HG00101 EUR -0.01243290 -0.00932382 0.000159626 -0.005677620 0.00907268
## 6 HG00102 EUR -0.00827301 -0.00616160 0.001818280 0.000230261 -0.00955764
tail(eigenphen_ampliseq_kgen.df)
## sample group PC1 PC2 PC3 PC4 PC5
## 3017 95_S517_L008 WECARE -0.0125655 0.000696678 0.000508483 0.00301086 0.000758076
## 3018 96_S236_L007 WECARE -0.0103028 -0.009400760 0.000408240 -0.00126134 0.012597400
## 3019 97_S509_L008 WECARE -0.0116830 -0.016781000 0.003277310 -0.00472534 0.025380700
## 3020 98_S335_L008 WECARE -0.0108617 -0.011973100 -0.000350267 -0.00303147 0.006616230
## 3021 99_S418_L008 WECARE -0.0124556 -0.014528400 -0.002571390 -0.00271848 0.005434640
## 3022 9_S346_L008 WECARE -0.0134890 -0.012876200 0.004023890 0.00388655 -0.003655670
# Clean-up
rm(nfe_pca, nfe_ampliseq, selected_samples, eigenvectors_ampliseq_kgen.df,
phenotypes_ampliseq.df, phenotypes_ampliseq_kgen.df, nfe.df, phenotypes_kgen.df,
eigenvectors.df, kg_phenotypes.df, phenotypes.df)
# Read data with wecare-only PCa and outliers
load(paste(base_folder, "s03_add_PCs_214_715.RData", sep="/"))
# Extract data about previously selected outliers (based on wecare only PCs: mean +/- 3sd)
outliers.df <- phenotypes.df %>%
filter(pc_outlier) %>%
select(long_ids)
wecare_pc_outlier <- eigenphen_ampliseq_kgen.df$sample %in% outliers.df$long_ids
sum(wecare_pc_outlier)
## [1] 26
# Add column with outliers to the main table
eigenphen_ampliseq_kgen.df <- data.frame(eigenphen_ampliseq_kgen.df, wecare_pc_outlier)
# Clean up
rm(genotypes.mx, phenotypes.df, variants.df, outliers.df, wecare_pc_outlier)
http://www.sthda.com/english/wiki/ggplot2-point-shapes
# Set outliers thresholds (manually selected on the basis of visual assessment of plots)
pc1_th <- 0 # 0.005
pc2_th <- 0.005 # 0.01
# Prepare vector fr colour scale
myColours <- c("EUR"="BLUE", "AFR"="YELLOW", "AMR"="GREEN",
"SAS"="GREY", "EAS"="PINK",
"WECARE"="RED")
myColourScale <- scale_colour_manual(values=myColours)
# Static plot
ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) +
geom_point(aes(col=group, shape=wecare_pc_outlier)) +
labs(title="PC1 vs PC2", x="PC1", y="PC2") +
scale_shape_manual(values=c(16,4)) +
geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
myColourScale
# Interactive plot
plotly_group <- factor(eigenphen_ampliseq_kgen.df$group,
levels=c("AFR","AMR","EAS","SAS","EUR","WECARE"))
g <- ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) +
geom_point(aes(col=plotly_group, shape=wecare_pc_outlier, text=sample)) +
labs(title="PC1 vs PC2 (manual thresholds)", x="PC1", y="PC2") +
scale_shape_manual(values=c(16,4)) +
geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
myColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(myColours, myColourScale, plotly_group, g)
manual_outliers.df <- eigenphen_ampliseq_kgen.df %>%
filter(PC1 > pc1_th | PC2 > pc2_th, group=="WECARE") %>%
select(sample, wecare_pc_outlier)
dim(manual_outliers.df)
## [1] 46 2
manual_outliers.df
## sample wecare_pc_outlier
## 1 103_S147_L007 TRUE
## 2 11_S358_L008 FALSE
## 3 133_S168_L007 FALSE
## 4 139_S123_L007 FALSE
## 5 141_S158_L007 TRUE
## 6 146_S408_L008 FALSE
## 7 148_S432_L008 TRUE
## 8 16_S109_L007 TRUE
## 9 1_S441_L008 FALSE
## 10 235_S535_L008 TRUE
## 11 238_S520_L008 TRUE
## 12 246_S375_L008 TRUE
## 13 256_S513_L008 TRUE
## 14 267_S48_L007 TRUE
## 15 273_S150_L007 TRUE
## 16 275_S22_L007 TRUE
## 17 276_S209_L007 FALSE
## 18 277_S292_L008 TRUE
## 19 285_S374_L008 FALSE
## 20 287_S10_L007 TRUE
## 21 289_S69_L007 FALSE
## 22 293_S479_L008 TRUE
## 23 308_S434_L008 TRUE
## 24 311_S137_L007 TRUE
## 25 313_S362_L008 FALSE
## 26 323_S469_L008 FALSE
## 27 326_S317_L008 TRUE
## 28 329_S373_L008 TRUE
## 29 330_S409_L008 FALSE
## 30 347_S36_L007 TRUE
## 31 352_S435_L008 FALSE
## 32 355_S365_L008 FALSE
## 33 35_S9_L007 FALSE
## 34 366_S293_L008 TRUE
## 35 368_S46_L007 TRUE
## 36 369_S230_L007 TRUE
## 37 372_S340_L008 FALSE
## 38 375_S140_L007 FALSE
## 39 376_S84_L007 FALSE
## 40 385_S305_L008 FALSE
## 41 388_S259_L007 TRUE
## 42 3_S360_L008 TRUE
## 43 403_S210_L007 TRUE
## 44 407_S106_L007 FALSE
## 45 408_S130_L007 TRUE
## 46 94_S143_L007 FALSE
sum(manual_outliers.df$wecare_pc_outlier)
## [1] 26
save.image(paste(base_folder, "s04_explore_ethnicity_of_214_715_outliers.RData", sep="/"))
ls()
## [1] "base_folder" "eigenphen_ampliseq_kgen.df" "manual_outliers.df" "pc1_th" "pc2_th"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.9.0 ggplot2_3.2.0 dplyr_0.8.1 knitr_1.23
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 later_0.8.0 pillar_1.4.1 compiler_3.5.1 tools_3.5.1 digest_0.6.19 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.2 rlang_0.3.4 shiny_1.3.2 crosstalk_1.0.0 yaml_2.2.0 xfun_0.7 withr_2.1.2 stringr_1.4.0 httr_1.4.0 htmlwidgets_1.3 grid_3.5.1 tidyselect_0.2.5 glue_1.3.1 data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 purrr_0.3.2 tidyr_0.8.3 magrittr_1.5 promises_1.0.1 scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 xtable_1.8-4 mime_0.7 colorspace_1.4-1 httpuv_1.5.1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
Sys.time()
## [1] "2019-09-01 18:26:00 BST"